package com.trytech.mongoocrawler.client.parser.jd;

import com.trytech.mongoocrawler.client.common.queue.UrlFetcherEventProducer;
import com.trytech.mongoocrawler.client.parser.HtmlParser;
import com.trytech.mongoocrawler.client.transport.http.UrlResult;
import com.trytech.mongoocrawler.client.transport.http.WebResult;
import org.apache.commons.lang3.StringUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import java.nio.charset.Charset;
import java.util.Iterator;

/**
 * Parser for JD.com (Jingdong) book list pages.
 *
 * <p>Finds the element with id {@code plist}, visits every descendant carrying the class
 * {@code gl-item}, and hands the first link of each item to the URL producer together with a
 * {@link JDBookDetailParser}.
 *
 * @author Collin Chiang
 * @date 2017-04-15
 */
public class JDBookListParser extends HtmlParser<Boolean> {
    /** Id of the element that wraps the list of book items. */
    private static final String PRODUCT_LIST_ID = "plist";
    /** CSS class carried by each book item inside the list. */
    private static final String PRODUCT_ITEM_CLASS = "gl-item";
    /** Scheme prepended to links that do not already carry one. */
    private static final String DEFAULT_SCHEME_PREFIX = "https:";

    /**
     * Extracts the book links from a list page and queues each of them for fetching.
     *
     * @param webResult   fetched page; its data is read as the HTML text of the page
     * @param urlProducer receives one {@link UrlResult} per book link found
     * @return {@code true} when the list container was found and all items were processed;
     *         {@code false} when the container is missing or any exception occurred
     */
    @Override
    public Boolean parse(WebResult webResult, UrlFetcherEventProducer urlProducer) {
        try {
            // The signature uses the raw type, so the cast cannot be checked at compile time;
            // a non-String payload surfaces as a ClassCastException handled below.
            @SuppressWarnings("unchecked")
            String html = ((WebResult<String>) webResult).getData();
            Document doc = Jsoup.parse(html);
            // NOTE(review): the document is built from an already-decoded String, so this
            // presumably only affects the document's output settings — confirm it is needed.
            doc.charset(Charset.forName("UTF-8"));
            Element body = doc.body();
            Element plist = body.getElementById(PRODUCT_LIST_ID);
            if (plist == null) {
                // No list container on the page: report failure without relying on an NPE.
                return false;
            }
            Elements bookListEles = plist.getElementsByClass(PRODUCT_ITEM_CLASS);
            for (Element bookEle : bookListEles) {
                Element aEle = bookEle.getElementsByTag("a").first();
                if (aEle == null) {
                    // An item without a link must not abort the remaining items.
                    continue;
                }
                String bookUrl = aEle.attr("href");
                if (StringUtils.isNotEmpty(bookUrl)) {
                    urlProducer.sendData(new UrlResult(toAbsoluteUrl(bookUrl), new JDBookDetailParser()));
                }
            }
            return true;
        } catch (Exception e) {
            // Boundary of the parser: any failure is reported to the caller as false.
            e.printStackTrace();
            return false;
        }
    }

    /**
     * Turns a link taken from the page into a URL with a scheme.
     *
     * @param href non-empty {@code href} value
     * @return {@code href} unchanged when it already starts with {@code http://} or
     *         {@code https://}; otherwise {@code href} prefixed with {@code https:}
     */
    private static String toAbsoluteUrl(String href) {
        if (href.startsWith("http://") || href.startsWith("https://")) {
            return href;
        }
        return DEFAULT_SCHEME_PREFIX + href;
    }
}
